package com.lucene.crawler.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.lucene.crawler.bean.LinkTypeData;
import com.lucene.crawler.rule.Rule;
import com.lucene.crawler.rule.RuleException;
import com.lucene.crawler.util.TextUtil;

/**
 * Fetches the page described by a {@link Rule} with jsoup and extracts the
 * hyperlinks ({@code <a>} elements) found inside the selected result elements.
 */
public class GrabService {
	
	/** Timeout handed to the jsoup connection, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 100000;
	
	/**
	 * Requests the rule's URL, narrows the document down to the elements
	 * selected by the rule, and returns one {@link LinkTypeData} per
	 * {@code <a>} element found beneath them.
	 *
	 * @param rule describes the URL, request parameters, request method and
	 *             how the result elements are selected; must not be null
	 * @return the extracted links in document order; empty (never null) when
	 *         nothing matched or when the request failed with an
	 *         {@link IOException}
	 * @throws RuleException if the rule fails validation or names an
	 *                       unsupported request method
	 */
	public static List<LinkTypeData> grab(Rule rule) {
		
		validateRule(rule);
		
		List<LinkTypeData> datas = new ArrayList<LinkTypeData>();
		
		try {
			Document doc = fetchDocument(rule);
			Elements results = selectResults(doc, rule.getType(), rule.getResultTagName());
			collectLinks(results, datas);
		} catch (IOException e) {
			// Best effort: a failed request is reported and yields an empty list
			// rather than propagating to the caller.
			e.printStackTrace();
		}
		
		return datas;
	}
	
	/**
	 * Builds the jsoup connection for the rule, attaches the request
	 * parameters and executes it with the rule's request method.
	 */
	private static Document fetchDocument(Rule rule) throws IOException {
		
		String[] params = rule.getParams();
		String[] values = rule.getValues();
		int requestType = rule.getRequestMethod();
		
		Connection conn = Jsoup.connect(rule.getUrl());
		
		// Attach the request parameters; validateRule guarantees that values
		// has one entry per parameter.
		if(params != null) {
			for(int i = 0; i < params.length; i++) {
				conn.data(params[i], values[i]);
			}
		}
		
		// Execute the request with the configured method.
		switch(requestType) {
			case Rule.GET:
				return conn.timeout(TIMEOUT_MILLIS).get();
			case Rule.POST:
				return conn.timeout(TIMEOUT_MILLIS).post();
			default:
				// Fail with a clear message instead of a NullPointerException
				// on a document that was never fetched.
				throw new RuleException("不支持的请求类型: " + requestType);
		}
	}
	
	/**
	 * Picks the elements whose links should be extracted, according to the
	 * rule's selection type.
	 */
	private static Elements selectResults(Document doc, int type, String resultTagName) {
		
		Elements results = new Elements();
		
		switch(type) {
			case Rule.CLASS:
				results = doc.getElementsByClass(resultTagName);
				break;
			case Rule.ID:
				// getElementById returns null when no element carries the id;
				// adding that null would break the link extraction later on.
				Element result = doc.getElementById(resultTagName);
				if(result != null) {
					results.add(result);
				}
				break;
			case Rule.SELECTION:
				results = doc.select(resultTagName);
				break;
			default:
				// With no result tag name given, fall back to the body element.
				// An unknown type with a non-empty name selects nothing.
				if(TextUtil.isEmpty(resultTagName)) {
					results = doc.getElementsByTag("body");
				}
				break;
		}
		
		return results;
	}
	
	/**
	 * Appends one {@link LinkTypeData} to {@code datas} for every
	 * {@code <a>} element found beneath each result element.
	 */
	private static void collectLinks(Elements results, List<LinkTypeData> datas) {
		
		for(Element result : results) {
			for(Element link : result.getElementsByTag("a")) {
				LinkTypeData data = new LinkTypeData();
				data.setLinkHref(link.attr("href"));
				data.setLinkTitle(link.text());
				datas.add(data);
			}
		}
	}
	
	/**
	 * Performs the necessary checks on the incoming rule.
	 *
	 * @param rule the rule to check; must not be null
	 * @throws RuleException if the URL is empty (as judged by
	 *                       {@code TextUtil.isEmpty}), does not start with
	 *                       {@code http://} or {@code https://}, or the
	 *                       parameter names and values do not pair up
	 */
	private static void validateRule(Rule rule) {
		
		String url = rule.getUrl();
		
		if(TextUtil.isEmpty(url)) {
			throw new RuleException("url不能为空!");
		}
		
		if(!url.startsWith("http://") && !url.startsWith("https://")) {
			throw new RuleException("url格式不正确!");
		}
		
		// Values are only read when parameters exist, so a missing values array
		// counts as zero entries instead of skipping the check altogether.
		String[] params = rule.getParams();
		if(params != null) {
			String[] values = rule.getValues();
			int valueCount = (values == null) ? 0 : values.length;
			if(params.length != valueCount) {
				throw new RuleException("参数的键值对个数不匹配!");
			}
		}
	}
	
}
